{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/crawl-astroawani/resolve/main/berita-dunia-v2.json.nested\n",
    "# !wget https://huggingface.co/datasets/mesolitica/crawl-astroawani/resolve/main/berita-hiburan-v2.json.nested\n",
    "# !wget https://huggingface.co/datasets/mesolitica/crawl-astroawani/resolve/main/berita-malaysia-v2.json.nested\n",
    "# !wget https://huggingface.co/datasets/mesolitica/crawl-astroawani/resolve/main/berita-politik-v2.json.nested\n",
    "# !wget https://huggingface.co/datasets/mesolitica/crawl-astroawani/resolve/main/berita-sukan-v2.json.nested\n",
    "# !wget https://huggingface.co/datasets/mesolitica/crawl-astroawani/resolve/main/berita-teknologi-v2.json.nested\n",
    "# !wget https://huggingface.co/datasets/mesolitica/crawl-astroawani/resolve/main/gaya-hidup-v2.json.nested"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import malaya\n",
    "from bs4 import BeautifulSoup\n",
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['berita-dunia-v2.json.nested',\n",
       " 'berita-hiburan-v2.json.nested',\n",
       " 'berita-malaysia-v2.json.nested',\n",
       " 'berita-politik-v2.json.nested',\n",
       " 'berita-sukan-v2.json.nested',\n",
       " 'berita-teknologi-v2.json.nested',\n",
       " 'gaya-hidup-v2.json.nested']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files = sorted(glob('*.json.nested'))\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "berita-dunia-v2.json.nested\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████| 10000/10000 [00:03<00:00, 2707.17it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "berita-hiburan-v2.json.nested\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████| 10000/10000 [00:05<00:00, 1754.35it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "berita-malaysia-v2.json.nested\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████| 10000/10000 [00:04<00:00, 2439.24it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "berita-politik-v2.json.nested\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████| 10000/10000 [00:05<00:00, 1989.35it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "berita-sukan-v2.json.nested\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████| 10000/10000 [00:04<00:00, 2232.22it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "berita-teknologi-v2.json.nested\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 998/998 [00:00<00:00, 2106.05it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gaya-hidup-v2.json.nested\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████| 5366/5366 [00:03<00:00, 1701.39it/s]\n"
     ]
    }
   ],
   "source": [
    "texts = []\n",
    "topics = defaultdict(set)\n",
    "\n",
    "for f in files:\n",
    "    print(f)\n",
    "    with open(f) as fopen:\n",
    "        data = json.load(fopen)\n",
    "    for d in tqdm(data):\n",
    "        if 'articleBody' not in d['r']['response']:\n",
    "            continue\n",
    "        try:\n",
    "            soup = BeautifulSoup(d['r']['response']['articleBody'], \"lxml\")\n",
    "            text = BeautifulSoup(soup.get_text(separator=\"\\n\").strip(), \"lxml\").get_text(separator=\"\\n\").strip()\n",
    "            d['r']['response']['articleBody'] = text\n",
    "            d['filename'] = f\n",
    "            texts.append(d)\n",
    "            tags = [t for t in d['original']['tags'] if len(t) > 5]\n",
    "            topics[f].update(set(tags))\n",
    "        except Exception as e:\n",
    "            print(e)\n",
    "            pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "topics_keys = set(topics.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_topics = list(topics.values())\n",
    "len(all_topics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "77279"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "flat_list = list(set([item for sublist in all_topics for item in sublist]))\n",
    "len(flat_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(True, 0)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'a'.islower(), 'a'.count(' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5094"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rejected = set([i for i in flat_list if i.islower() and i.count(' ') < 1])\n",
    "len(rejected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "for k in topics_keys:\n",
    "    topics[k] = topics[k] - rejected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████| 52548/52548 [00:00<00:00, 632912.51it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "54987\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████| 52548/52548 [00:16<00:00, 3143.90it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "118155"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "X1, X2, Y = [], [], []\n",
    "\n",
    "for d in tqdm(texts):\n",
    "    if random.random() > 0.7:\n",
    "        continue\n",
    "        \n",
    "    t = list(set(d['original']['tags']) - rejected)\n",
    "    if not len(t):\n",
    "        continue\n",
    "    \n",
    "    if random.random() > 0.5:\n",
    "        X1.append(d['original']['title'])\n",
    "        X2.append(f\"topik {random.choice(t)}\")\n",
    "        Y.append(1)\n",
    "    \n",
    "    if random.random() > 0.5:\n",
    "        X1.append(d['original']['description'])\n",
    "        X2.append(f\"topik {random.choice(t)}\")\n",
    "        Y.append(1)\n",
    "    \n",
    "    if random.random() > 0.5:\n",
    "        X1.append(d['r']['response']['articleBody'])\n",
    "        X2.append(f\"topik {random.choice(t)}\")\n",
    "        Y.append(1)\n",
    "        \n",
    "print(len(Y))\n",
    "for d in tqdm(texts):\n",
    "    if random.random() > 0.8:\n",
    "        continue\n",
    "        \n",
    "    t_n = set(random.choice(d['original']['tags']))\n",
    "    t = list(topics[random.choice(list(topics_keys - {d['filename']}))] - t_n)\n",
    "    if not len(t):\n",
    "        continue\n",
    "    \n",
    "    if random.random() > 0.5:\n",
    "        X1.append(d['original']['title'])\n",
    "        X2.append(f\"topik {random.choice(t)}\")\n",
    "        Y.append(0)\n",
    "    \n",
    "    if random.random() > 0.5:\n",
    "        X1.append(d['original']['description'])\n",
    "        X2.append(f\"topik {random.choice(t)}\")\n",
    "        Y.append(0)\n",
    "    \n",
    "    if random.random() > 0.5:\n",
    "        X1.append(d['r']['response']['articleBody'])\n",
    "        X2.append(f\"topik {random.choice(t)}\")\n",
    "        Y.append(0)\n",
    "    \n",
    "len(X1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('topics.json', 'w') as fopen:\n",
    "    json.dump({\n",
    "        'X1': X1,\n",
    "        'X2': X2,\n",
    "        'Y': Y,\n",
    "    }, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
